Blenda Hoang, Alexander Chin, Billy Marin, Henry Raymond, Shuang (Penny) Peng
%autosave 300
Autosaving every 300 seconds
#Print all the outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
import seaborn as sns
%pylab inline
import sklearn as sk
import sklearn.tree
from IPython.display import Image
import pydotplus
import plotly.express as px
import sklearn.tree as tree
#Suppress Pandas future warning
import warnings
warnings.filterwarnings('ignore')
Populating the interactive namespace from numpy and matplotlib
pd.set_option('display.max_columns',None)
df = pd.read_csv('Telco_Data_V3.csv')
df.head()
| LoyaltyID | Customer ID | Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Internet Service | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Contract | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Gender | Age | Under 30 | Senior Citizen.1 | Married | Dependents.1 | Number of Dependents | City | Zip Code | Latitude | Longitude | Population | Total Revenue | Satisfaction Score | Customer Status | Churn Value | Churn Score | Churn Category | Churn Reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 318537 | 7590-VHVEG | No | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | Female | 36 | No | No | Yes | No | 0 | Los Angeles | 90001 | 33.973616 | -118.249020 | 54492 | 29.85 | 3 | Joined | 0 | 27 | 0 | 0 |
| 1 | 152148 | 5575-GNVDE | No | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | Male | 46 | No | No | No | No | 0 | Los Angeles | 90002 | 33.949255 | -118.246978 | 44586 | 2470.56 | 3 | Stayed | 0 | 61 | 0 | 0 |
| 2 | 326527 | 3668-QPYBK | No | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Male | 37 | No | No | No | No | 0 | Los Angeles | 90003 | 33.964131 | -118.272783 | 58198 | 129.09 | 1 | Churned | 1 | 86 | Competitor | Competitor made better offer |
| 3 | 845894 | 7795-CFOCW | No | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | Male | 53 | No | No | No | No | 0 | Los Angeles | 90004 | 34.076259 | -118.310715 | 67852 | 1840.75 | 3 | Stayed | 0 | 66 | 0 | 0 |
| 4 | 503388 | 9237-HQITU | No | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Female | 19 | Yes | No | No | Yes | 2 | Los Angeles | 90005 | 34.059281 | -118.307420 | 43019 | 169.89 | 2 | Churned | 1 | 67 | Other | Moved |
df.corr()
| LoyaltyID | Tenure | Monthly Charges | Age | Number of Dependents | Zip Code | Latitude | Longitude | Population | Total Revenue | Satisfaction Score | Churn Value | Churn Score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LoyaltyID | 1.000000 | -0.003633 | -0.006984 | -0.018213 | 0.005464 | -0.022660 | -0.030903 | 0.027215 | 0.020175 | -0.007301 | -0.000186 | 0.009513 | 0.013237 |
| Tenure | -0.003633 | 1.000000 | 0.247900 | 0.010202 | 0.107689 | 0.007362 | 0.012075 | -0.009716 | -0.012388 | 0.853078 | 0.210056 | -0.352229 | -0.224523 |
| Monthly Charges | -0.006984 | 0.247900 | 1.000000 | 0.144685 | -0.131837 | -0.010775 | -0.030325 | 0.028429 | 0.009597 | 0.588887 | -0.223922 | 0.193356 | 0.132115 |
| Age | -0.018213 | 0.010202 | 0.144685 | 1.000000 | -0.119000 | -0.008183 | -0.010305 | 0.007612 | -0.018881 | 0.048265 | -0.085170 | 0.115760 | 0.083919 |
| Number of Dependents | 0.005464 | 0.107689 | -0.131837 | -0.119000 | 1.000000 | 0.016493 | 0.029081 | -0.024271 | -0.015486 | 0.038038 | 0.152873 | -0.218780 | -0.160743 |
| Zip Code | -0.022660 | 0.007362 | -0.010775 | -0.008183 | 0.016493 | 1.000000 | 0.894769 | -0.790564 | -0.424067 | 0.002944 | 0.002121 | -0.016289 | -0.018020 |
| Latitude | -0.030903 | 0.012075 | -0.030325 | -0.010305 | 0.029081 | 0.894769 | 1.000000 | -0.885979 | -0.434801 | 0.000082 | 0.022367 | -0.041546 | -0.037615 |
| Longitude | 0.027215 | -0.009716 | 0.028429 | 0.007612 | -0.024271 | -0.790564 | -0.885979 | 1.000000 | 0.296288 | 0.001062 | -0.009048 | 0.024052 | 0.019394 |
| Population | 0.020175 | -0.012388 | 0.009597 | -0.018881 | -0.015486 | -0.424067 | -0.434801 | 0.296288 | 1.000000 | -0.010546 | -0.031056 | 0.051649 | 0.020012 |
| Total Revenue | -0.007301 | 0.853078 | 0.588887 | 0.048265 | 0.038038 | 0.002944 | 0.000082 | 0.001062 | -0.010546 | 1.000000 | 0.094796 | -0.223003 | -0.138072 |
| Satisfaction Score | -0.000186 | 0.210056 | -0.223922 | -0.085170 | 0.152873 | 0.002121 | 0.022367 | -0.009048 | -0.031056 | 0.094796 | 1.000000 | -0.754649 | -0.495144 |
| Churn Value | 0.009513 | -0.352229 | 0.193356 | 0.115760 | -0.218780 | -0.016289 | -0.041546 | 0.024052 | 0.051649 | -0.223003 | -0.754649 | 1.000000 | 0.660772 |
| Churn Score | 0.013237 | -0.224523 | 0.132115 | 0.083919 | -0.160743 | -0.018020 | -0.037615 | 0.019394 | 0.020012 | -0.138072 | -0.495144 | 0.660772 | 1.000000 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LoyaltyID 7043 non-null int64 1 Customer ID 7043 non-null object 2 Senior Citizen 7043 non-null object 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 Tenure 7043 non-null int64 6 Phone Service 7043 non-null object 7 Multiple Lines 7043 non-null object 8 Internet Service 7043 non-null object 9 Online Security 7043 non-null object 10 Online Backup 7043 non-null object 11 Device Protection 7043 non-null object 12 Tech Support 7043 non-null object 13 Streaming TV 7043 non-null object 14 Streaming Movies 7043 non-null object 15 Contract 7043 non-null object 16 Paperless Billing 7043 non-null object 17 Payment Method 7043 non-null object 18 Monthly Charges 7043 non-null float64 19 Total Charges 7043 non-null object 20 Gender 7043 non-null object 21 Age 7043 non-null int64 22 Under 30 7043 non-null object 23 Senior Citizen.1 7043 non-null object 24 Married 7043 non-null object 25 Dependents.1 7043 non-null object 26 Number of Dependents 7043 non-null int64 27 City 7043 non-null object 28 Zip Code 7043 non-null int64 29 Latitude 7043 non-null float64 30 Longitude 7043 non-null float64 31 Population 7043 non-null int64 32 Total Revenue 7043 non-null float64 33 Satisfaction Score 7043 non-null int64 34 Customer Status 7043 non-null object 35 Churn Value 7043 non-null int64 36 Churn Score 7043 non-null int64 37 Churn Category 7043 non-null object 38 Churn Reason 7043 non-null object dtypes: float64(4), int64(9), object(26) memory usage: 2.1+ MB
df1 = df.copy()
#Replacing strings with numbers 1 or 0
df1.replace({'Yes':1,'No':0,'No phone service':0,'No internet service':0},inplace = True)
#Drop duplicate columns or columns that are not useful for our analysis
df1.drop(columns = ['LoyaltyID','Customer ID','Senior Citizen.1','Married','Dependents.1','Customer Status'],inplace = True)
df1.head(1)
| Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Internet Service | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Contract | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Gender | Age | Under 30 | Number of Dependents | City | Zip Code | Latitude | Longitude | Population | Total Revenue | Satisfaction Score | Churn Value | Churn Score | Churn Category | Churn Reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 1 | 0 | 0 | DSL | 0 | 1 | 0 | 0 | 0 | 0 | Month-to-month | 1 | Electronic check | 29.85 | 29.85 | Female | 36 | 0 | 0 | Los Angeles | 90001 | 33.973616 | -118.24902 | 54492 | 29.85 | 3 | 0 | 27 | 0 | 0 |
df1.isna().sum()
Senior Citizen 0 Partner 0 Dependents 0 Tenure 0 Phone Service 0 Multiple Lines 0 Internet Service 0 Online Security 0 Online Backup 0 Device Protection 0 Tech Support 0 Streaming TV 0 Streaming Movies 0 Contract 0 Paperless Billing 0 Payment Method 0 Monthly Charges 0 Total Charges 0 Gender 0 Age 0 Under 30 0 Number of Dependents 0 City 0 Zip Code 0 Latitude 0 Longitude 0 Population 0 Total Revenue 0 Satisfaction Score 0 Churn Value 0 Churn Score 0 Churn Category 0 Churn Reason 0 dtype: int64
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Senior Citizen 7043 non-null int64 1 Partner 7043 non-null int64 2 Dependents 7043 non-null int64 3 Tenure 7043 non-null int64 4 Phone Service 7043 non-null int64 5 Multiple Lines 7043 non-null int64 6 Internet Service 7043 non-null object 7 Online Security 7043 non-null int64 8 Online Backup 7043 non-null int64 9 Device Protection 7043 non-null int64 10 Tech Support 7043 non-null int64 11 Streaming TV 7043 non-null int64 12 Streaming Movies 7043 non-null int64 13 Contract 7043 non-null object 14 Paperless Billing 7043 non-null int64 15 Payment Method 7043 non-null object 16 Monthly Charges 7043 non-null float64 17 Total Charges 7043 non-null object 18 Gender 7043 non-null object 19 Age 7043 non-null int64 20 Under 30 7043 non-null int64 21 Number of Dependents 7043 non-null int64 22 City 7043 non-null object 23 Zip Code 7043 non-null int64 24 Latitude 7043 non-null float64 25 Longitude 7043 non-null float64 26 Population 7043 non-null int64 27 Total Revenue 7043 non-null float64 28 Satisfaction Score 7043 non-null int64 29 Churn Value 7043 non-null int64 30 Churn Score 7043 non-null int64 31 Churn Category 7043 non-null object 32 Churn Reason 7043 non-null object dtypes: float64(4), int64(21), object(8) memory usage: 1.8+ MB
#Convert object type to numeric value
df1['Total Charges'] = pd.to_numeric(df1['Total Charges'], errors='coerce')
#Calculate the churn rate based on each zip code
df_churn_rate = pd.DataFrame(df1.groupby('Zip Code')['Churn Value'].mean()).\
rename(columns = {'Churn Value':'churn_rate'})
#Df_churn_rate.head(1)
#Merging in zip code so we can view churn rate by customer by zip code
df1 = df1.merge(df_churn_rate, left_on = 'Zip Code', right_on= 'Zip Code',how='left')
#df1.head(1)
df1 = pd.get_dummies(df1, columns=['Internet Service','Contract'],dummy_na=False)
df2 = df1.copy()
df2 = pd.get_dummies(df2, columns=['Payment Method','Gender','Churn Reason'])
df_churn = df2[df2['Churn Value'] == 1]
#df_churn.head(1)
df3 = df1.copy()
df3 = pd.get_dummies(df3, columns=['City','Payment Method','Gender'])
df1.head(1)
| Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Gender | Age | Under 30 | Number of Dependents | City | Zip Code | Latitude | Longitude | Population | Total Revenue | Satisfaction Score | Churn Value | Churn Score | Churn Category | Churn Reason | churn_rate | Internet Service_0 | Internet Service_DSL | Internet Service_Fiber optic | Contract_Month-to-month | Contract_One year | Contract_Two year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | Electronic check | 29.85 | 29.85 | Female | 36 | 0 | 0 | Los Angeles | 90001 | 33.973616 | -118.24902 | 54492 | 29.85 | 3 | 0 | 27 | 0 | 0 | 0.25 | 0 | 1 | 0 | 1 | 0 | 0 |
#Dropping N/As from our dataset given small value
df3.isna().any().sum()
1
df3.dropna(inplace=True)
len(df3)
7032
https://community.ibm.com/accelerators/?context=analytics&query=telco%20customer&type=Data
This dataset was sourced from IBM and contains customer information from a Telecommunications company (Telco) that provides a variety of phone and internet services in California. Each row represents a customer and the columns provide different attributes of each customer such as demographic information, the type of services they are enrolled in, and whether or not they churned (i.e. left the company).
**Variables of Consideration**
dt = tree.DecisionTreeClassifier(max_depth=2)
#Removing all highly correlated columns to 'Churn Value'
X = df3.drop(columns=['Churn Value','Satisfaction Score','Churn Reason',\
'Churn Category','Churn Score','churn_rate'])
Y = df3['Churn Value']
dt.fit(X,Y)
DecisionTreeClassifier(max_depth=2)
dt_feature_names = list(X.columns)
dt_target_names = [str(s) for s in Y.unique()]
tree.export_graphviz(dt, out_file='tree.dot',
feature_names=dt_feature_names, class_names=dt_target_names,
filled=True)
graph = pydotplus.graph_from_dot_file('tree.dot')
Image(graph.create_png())
#Churn by Internet Service Type which indicates Fiber Optic has definitively higher churn than DSL.
sns.countplot(x='Churn Value',hue='Internet Service',data=df)
<AxesSubplot:xlabel='Churn Value', ylabel='count'>
#Churn by Contract Type which indicates Month-to-Month contracts have a definitively higher churn rate than longer term contracts.
sns.countplot(x='Churn Value',hue='Contract',data=df)
<AxesSubplot:xlabel='Churn Value', ylabel='count'>
df.head(1)
| LoyaltyID | Customer ID | Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Internet Service | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Contract | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Gender | Age | Under 30 | Senior Citizen.1 | Married | Dependents.1 | Number of Dependents | City | Zip Code | Latitude | Longitude | Population | Total Revenue | Satisfaction Score | Customer Status | Churn Value | Churn Score | Churn Category | Churn Reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 318537 | 7590-VHVEG | No | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | Female | 36 | No | No | Yes | No | 0 | Los Angeles | 90001 | 33.973616 | -118.24902 | 54492 | 29.85 | 3 | Joined | 0 | 27 | 0 | 0 |
df4=df2.copy()
#Group by Churn Value, Month-to-Month contracts, Monthly Charges, and Satisfaction Score which confirm the results
#of our original decision tree.
df4.groupby(['Churn Value','Contract_Month-to-month']).agg({'Internet Service_Fiber optic':'mean',\
'Monthly Charges':'mean','Satisfaction Score':'mean'}).style.background_gradient(cmap = 'Blues')
| Internet Service_Fiber optic | Monthly Charges | Satisfaction Score | ||
|---|---|---|---|---|
| Churn Value | Contract_Month-to-month | |||
| 0 | 0 | 0.281991 | 61.116689 | 3.746445 |
| 1 | 0.435135 | 61.462635 | 3.847748 | |
| 1 | 0 | 0.630841 | 85.438084 | 1.906542 |
| 1 | 0.702115 | 73.019396 | 1.714199 |
#Groupby of Churn Value which shows of those customers who churned, a greater proportion of them
#were Fiber Optic Customers, had month-to-month contracts, and a lower satisfaction score.
df4.groupby('Churn Value').agg({'Contract_Month-to-month':'mean','Internet Service_Fiber optic':'mean','Monthly Charges':'mean',\
'Satisfaction Score':'mean'})
| Contract_Month-to-month | Internet Service_Fiber optic | Monthly Charges | Satisfaction Score | |
|---|---|---|---|---|
| Churn Value | ||||
| 0 | 0.429068 | 0.347700 | 61.265124 | 3.789911 |
| 1 | 0.885500 | 0.693954 | 74.441332 | 1.736223 |
#The average satisfaction score is consistently lower for customers who have monthly charges above $94/month.
#Almost the entirety of customers who pay more than $94/month subscribe to Fiber Optic.
df4['Mcharge_bin'] = pd.cut(df4['Monthly Charges'], bins=[0,93.68,119])
df4.groupby(['Churn Value','Contract_Month-to-month','Mcharge_bin']).agg({'Internet Service_Fiber optic':'mean',\
'Monthly Charges':'mean','Satisfaction Score':'mean'}).style.background_gradient(cmap = 'Blues')
| Internet Service_Fiber optic | Monthly Charges | Satisfaction Score | |||
|---|---|---|---|---|---|
| Churn Value | Contract_Month-to-month | Mcharge_bin | |||
| 0 | 0 | (0.0, 93.68] | 0.076253 | 48.461917 | 3.765577 |
| (93.68, 119.0] | 0.998483 | 105.187557 | 3.679818 | ||
| 1 | (0.0, 93.68] | 0.340000 | 54.865868 | 3.859474 | |
| (93.68, 119.0] | 1.000000 | 100.630937 | 3.778125 | ||
| 1 | 0 | (0.0, 93.68] | 0.168421 | 61.501053 | 2.010526 |
| (93.68, 119.0] | 1.000000 | 104.547479 | 1.823529 | ||
| 1 | (0.0, 93.68] | 0.614241 | 65.150469 | 1.741002 | |
| (93.68, 119.0] | 1.000000 | 99.694430 | 1.623342 |
#Group Churn Reasons with highest on top
df1[df1['Churn Reason']!='0'].groupby('Churn Reason').agg({'Churn Value':'size'}).\
nlargest(20, 'Churn Value').style.background_gradient(cmap = 'Blues')
| Churn Value | |
|---|---|
| Churn Reason | |
| Competitor had better devices | 313 |
| Competitor made better offer | 311 |
| Attitude of support person | 220 |
| Don't know | 130 |
| Competitor offered more data | 117 |
| Competitor offered higher download speeds | 100 |
| Attitude of service provider | 94 |
| Price too high | 78 |
| Product dissatisfaction | 77 |
| Network reliability | 72 |
| Long distance charges | 64 |
| Service dissatisfaction | 63 |
| Moved | 46 |
| Extra data charges | 39 |
| Limited range of services | 37 |
| Poor expertise of online support | 31 |
| Lack of affordable download/upload speed | 30 |
| Lack of self-service on Website | 29 |
| Poor expertise of phone support | 12 |
| Deceased | 6 |
#Drop the columns that have similar information as other columns; location information (since we can eyeball location from the map);
#and columns that are highly correlated with churn rate.
X= df_churn.drop(columns = ['Paperless Billing','Total Charges','Under 30','Latitude','Longitude','City','Zip Code',\
'Satisfaction Score','Churn Value','Churn Score','Churn Category','churn_rate'])
Y = df_churn['churn_rate']
dt = sklearn.tree.DecisionTreeRegressor(max_depth = 2)
dt.fit(X,Y)
dt_feature_names = list(X.columns)
dt_target_names = np.array(Y.unique(),dtype=np.string_)
tree.export_graphviz(dt, out_file='tree.dot',
feature_names=dt_feature_names, class_names=dt_target_names,
filled=True)
graph = pydotplus.graph_from_dot_file('tree.dot')
Image(graph.create_png())
DecisionTreeRegressor(max_depth=2)
#Plot churn rates by population size bin to confirm Decision Tree results.
df_churn['Population_bin'] = pd.cut(df_churn['Population'],bins=[0,27249,110000])
df_churn.groupby('Population_bin')['churn_rate'].mean().plot(kind='bar')
<AxesSubplot:xlabel='Population_bin'>
#Groupby to get zip codes with highest churn rates
ChurnBy_zip1 = df1.groupby(['Zip Code']).agg({'Churn Value':'size','Latitude':'mean',\
'Longitude':'mean','churn_rate':'mean',})
ChurnBy_zip2 = df1.groupby(['Zip Code']).agg({'Churn Value':'size','Latitude':'mean',\
'Longitude':'mean','churn_rate':'mean',}).nlargest(40, 'churn_rate')
#Map by highest churn quantity by Zip Code, color by churn rate and size by quantity of customers that churned
import plotly.express as px
px.set_mapbox_access_token ='pk.eyJ1IjoiYm1hcmluMiIsImEiOiJja3dicGplOTIydm54Mm9tbnQ2cWFwejV6In0.4VaP0Cmn09tQ07XOlwI_ZQ'
fig = px.scatter_mapbox(data_frame=ChurnBy_zip1,
lat="Latitude",
lon="Longitude",
color="churn_rate",
size = "Churn Value",
color_continuous_scale='rdpu')
fig.update_layout(mapbox_style="carto-positron",\
mapbox_zoom=4.4, mapbox_center = {"lat": 37.5, "lon": -117})
#Map with top churn ratios by Zip Code, color by churn rate and size by count of customers
px.set_mapbox_access_token ='pk.eyJ1IjoiYm1hcmluMiIsImEiOiJja3dicGplOTIydm54Mm9tbnQ2cWFwejV6In0.4VaP0Cmn09tQ07XOlwI_ZQ'
fig = px.scatter_mapbox(data_frame=ChurnBy_zip2,
lat="Latitude",
lon="Longitude",
color="churn_rate",
size = "Churn Value",
color_continuous_scale='turbo')
fig.update_layout(mapbox_style="carto-positron",\
mapbox_zoom=4.4, mapbox_center = {"lat": 37.5, "lon": -117})
df1['geo_bin'] = pd.cut(df1.Latitude, bins=[30,33,35,37,42])
sns.catplot(hue='Churn Reason',y='geo_bin', data=df1[df1['Churn Category']!='0'],\
palette="Set2", kind='count',aspect=2)
<seaborn.axisgrid.FacetGrid at 0x7fb0fbe389a0>
pd.set_option('display.max_rows', 80)
df1[df1['Churn Reason']!='0'].groupby(['geo_bin','Churn Reason']).agg({'Churn Value':'size',\
'Internet Service_Fiber optic':'mean','Senior Citizen':'mean',\
'Contract_Month-to-month':'mean', 'Tenure':'mean'}).sort_values(ascending=False,\
by=['geo_bin','Churn Value']).rename(columns={'Customer ID':'# Churned',\
'Internet Service_Fiber optic':'Fiber %',\
'Senior Citizen':'Senior %','Contract_Month-to-month':'Contract_M2M %'})
| Churn Value | Fiber % | Senior % | Contract_M2M % | Tenure | ||
|---|---|---|---|---|---|---|
| geo_bin | Churn Reason | |||||
| (37, 42] | Competitor had better devices | 151 | 0.768212 | 0.304636 | 0.933775 | 14.841060 |
| Competitor offered more data | 73 | 0.794521 | 0.164384 | 0.835616 | 25.095890 | |
| Attitude of support person | 69 | 0.681159 | 0.463768 | 0.942029 | 18.507246 | |
| Competitor made better offer | 67 | 0.731343 | 0.328358 | 0.925373 | 14.298507 | |
| Competitor offered higher download speeds | 57 | 0.719298 | 0.087719 | 0.842105 | 20.614035 | |
| Don't know | 52 | 0.750000 | 0.288462 | 0.884615 | 19.153846 | |
| Price too high | 43 | 0.651163 | 0.279070 | 0.813953 | 22.279070 | |
| Network reliability | 39 | 0.564103 | 0.256410 | 0.871795 | 17.282051 | |
| Product dissatisfaction | 34 | 0.705882 | 0.352941 | 0.764706 | 23.911765 | |
| Service dissatisfaction | 34 | 0.735294 | 0.176471 | 0.941176 | 20.735294 | |
| Long distance charges | 33 | 0.393939 | 0.060606 | 0.818182 | 13.060606 | |
| Attitude of service provider | 30 | 0.633333 | 0.266667 | 0.866667 | 20.333333 | |
| Moved | 22 | 0.681818 | 0.090909 | 0.818182 | 16.772727 | |
| Limited range of services | 19 | 0.736842 | 0.263158 | 0.894737 | 13.789474 | |
| Extra data charges | 18 | 0.722222 | 0.277778 | 1.000000 | 21.722222 | |
| Poor expertise of online support | 17 | 0.705882 | 0.294118 | 0.823529 | 15.411765 | |
| Lack of affordable download/upload speed | 14 | 0.785714 | 0.142857 | 0.857143 | 21.285714 | |
| Lack of self-service on Website | 13 | 0.692308 | 0.000000 | 0.923077 | 14.000000 | |
| Poor expertise of phone support | 7 | 0.571429 | 0.000000 | 0.714286 | 15.571429 | |
| Deceased | 3 | 0.666667 | 0.333333 | 1.000000 | 9.333333 | |
| (35, 37] | Competitor had better devices | 35 | 0.742857 | 0.200000 | 0.942857 | 12.771429 |
| Attitude of support person | 28 | 0.607143 | 0.250000 | 0.892857 | 14.642857 | |
| Attitude of service provider | 25 | 0.720000 | 0.160000 | 0.760000 | 19.520000 | |
| Competitor made better offer | 12 | 0.666667 | 0.500000 | 0.833333 | 11.833333 | |
| Don't know | 11 | 0.818182 | 0.090909 | 1.000000 | 14.636364 | |
| Competitor offered higher download speeds | 9 | 0.666667 | 0.222222 | 0.777778 | 17.222222 | |
| Product dissatisfaction | 7 | 0.571429 | 0.428571 | 0.857143 | 25.142857 | |
| Competitor offered more data | 6 | 1.000000 | 0.166667 | 0.833333 | 14.666667 | |
| Long distance charges | 6 | 0.333333 | 0.000000 | 0.833333 | 19.000000 | |
| Moved | 5 | 1.000000 | 0.400000 | 0.600000 | 36.200000 | |
| Price too high | 5 | 0.800000 | 0.800000 | 1.000000 | 21.000000 | |
| Service dissatisfaction | 4 | 1.000000 | 0.250000 | 0.750000 | 22.250000 | |
| Extra data charges | 3 | 1.000000 | 0.333333 | 1.000000 | 12.000000 | |
| Lack of affordable download/upload speed | 3 | 0.666667 | 0.333333 | 0.666667 | 19.000000 | |
| Lack of self-service on Website | 3 | 0.333333 | 0.000000 | 1.000000 | 24.333333 | |
| Network reliability | 3 | 0.666667 | 0.000000 | 1.000000 | 9.666667 | |
| Poor expertise of online support | 3 | 1.000000 | 0.666667 | 1.000000 | 15.000000 | |
| Poor expertise of phone support | 2 | 0.500000 | 0.500000 | 1.000000 | 8.500000 | |
| Limited range of services | 1 | 0.000000 | 0.000000 | 1.000000 | 13.000000 | |
| Deceased | 0 | NaN | NaN | NaN | NaN | |
| (33, 35] | Attitude of support person | 113 | 0.628319 | 0.194690 | 0.929204 | 16.681416 |
| Competitor had better devices | 113 | 0.743363 | 0.230088 | 0.893805 | 18.309735 | |
| Competitor made better offer | 84 | 0.726190 | 0.297619 | 0.892857 | 17.011905 | |
| Don't know | 50 | 0.520000 | 0.140000 | 0.900000 | 16.000000 | |
| Attitude of service provider | 35 | 0.771429 | 0.314286 | 0.914286 | 12.000000 | |
| Competitor offered more data | 34 | 0.764706 | 0.352941 | 0.852941 | 20.882353 | |
| Product dissatisfaction | 31 | 0.709677 | 0.225806 | 0.838710 | 21.935484 | |
| Competitor offered higher download speeds | 30 | 0.566667 | 0.166667 | 0.733333 | 22.666667 | |
| Network reliability | 29 | 0.793103 | 0.275862 | 0.896552 | 16.241379 | |
| Price too high | 29 | 0.724138 | 0.344828 | 0.965517 | 18.482759 | |
| Service dissatisfaction | 25 | 0.600000 | 0.320000 | 0.960000 | 14.200000 | |
| Long distance charges | 24 | 0.500000 | 0.333333 | 0.833333 | 18.791667 | |
| Moved | 19 | 0.736842 | 0.210526 | 0.842105 | 17.157895 | |
| Extra data charges | 18 | 0.666667 | 0.333333 | 0.833333 | 26.277778 | |
| Limited range of services | 17 | 0.647059 | 0.588235 | 0.941176 | 11.117647 | |
| Lack of affordable download/upload speed | 13 | 0.538462 | 0.384615 | 0.769231 | 19.769231 | |
| Lack of self-service on Website | 11 | 0.454545 | 0.000000 | 1.000000 | 13.818182 | |
| Poor expertise of online support | 8 | 0.500000 | 0.125000 | 1.000000 | 17.375000 | |
| Deceased | 3 | 0.666667 | 0.333333 | 1.000000 | 8.333333 | |
| Poor expertise of phone support | 3 | 0.666667 | 0.000000 | 0.666667 | 13.333333 | |
| (30, 33] | Competitor made better offer | 148 | 0.736486 | 0.250000 | 0.898649 | 17.500000 |
| Don't know | 17 | 0.647059 | 0.176471 | 0.823529 | 25.647059 | |
| Competitor had better devices | 14 | 0.642857 | 0.357143 | 0.928571 | 21.500000 | |
| Attitude of support person | 10 | 0.700000 | 0.300000 | 1.000000 | 23.700000 | |
| Product dissatisfaction | 5 | 0.800000 | 0.200000 | 0.800000 | 32.800000 | |
| Attitude of service provider | 4 | 0.750000 | 0.500000 | 0.500000 | 29.250000 | |
| Competitor offered higher download speeds | 4 | 0.500000 | 0.000000 | 1.000000 | 7.750000 | |
| Competitor offered more data | 4 | 0.750000 | 0.500000 | 1.000000 | 19.750000 | |
| Poor expertise of online support | 3 | 1.000000 | 0.666667 | 0.666667 | 30.000000 | |
| Lack of self-service on Website | 2 | 0.500000 | 0.000000 | 1.000000 | 3.500000 | |
| Long distance charges | 1 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | |
| Network reliability | 1 | 0.000000 | 0.000000 | 0.000000 | 54.000000 | |
| Price too high | 1 | 1.000000 | 0.000000 | 1.000000 | 12.000000 | |
| Deceased | 0 | NaN | NaN | NaN | NaN | |
| Extra data charges | 0 | NaN | NaN | NaN | NaN | |
| Lack of affordable download/upload speed | 0 | NaN | NaN | NaN | NaN | |
| Limited range of services | 0 | NaN | NaN | NaN | NaN | |
| Moved | 0 | NaN | NaN | NaN | NaN | |
| Poor expertise of phone support | 0 | NaN | NaN | NaN | NaN | |
| Service dissatisfaction | 0 | NaN | NaN | NaN | NaN |
pd.set_option('display.max_rows', 80)
Churn_Reason = df1[df1['Churn Reason']!='0'].groupby(['geo_bin','Churn Reason']).agg({'Churn Value':'size'})\
.sort_values(ascending=False,\
by=['geo_bin','Churn Value']).rename(columns={'Churn Value':'Churn Reason Count'})
Churn_Reason_Percent = Churn_Reason.groupby(level=0).apply(lambda x:
(x*100) / x.sum()).rename(columns={'Churn Reason Count':'Churn Reason %'})
Churn_Reason_Percent.groupby('geo_bin',as_index=False).apply(lambda x : x.sort_values(by = 'Churn Reason %', ascending = False).head(5))\
.style.background_gradient(cmap = 'Blues')
| Churn Reason % | |||
|---|---|---|---|
| geo_bin | Churn Reason | ||
| 0 | (30, 33] | Competitor made better offer | 69.158879 |
| Don't know | 7.943925 | ||
| Competitor had better devices | 6.542056 | ||
| Attitude of support person | 4.672897 | ||
| Product dissatisfaction | 2.336449 | ||
| 1 | (33, 35] | Attitude of support person | 16.400581 |
| Competitor had better devices | 16.400581 | ||
| Competitor made better offer | 12.191582 | ||
| Don't know | 7.256894 | ||
| Attitude of service provider | 5.079826 | ||
| 2 | (35, 37] | Competitor had better devices | 20.467836 |
| Attitude of support person | 16.374269 | ||
| Attitude of service provider | 14.619883 | ||
| Competitor made better offer | 7.017544 | ||
| Don't know | 6.432749 | ||
| 3 | (37, 42] | Competitor had better devices | 18.993711 |
| Competitor offered more data | 9.182390 | ||
| Attitude of support person | 8.679245 | ||
| Competitor made better offer | 8.427673 | ||
| Competitor offered higher download speeds | 7.169811 |
df1.head()
| Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Paperless Billing | Payment Method | Monthly Charges | Total Charges | Gender | Age | Under 30 | Number of Dependents | City | Zip Code | Latitude | Longitude | Population | Total Revenue | Satisfaction Score | Churn Value | Churn Score | Churn Category | Churn Reason | churn_rate | Internet Service_0 | Internet Service_DSL | Internet Service_Fiber optic | Contract_Month-to-month | Contract_One year | Contract_Two year | geo_bin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | Electronic check | 29.85 | 29.85 | Female | 36 | 0 | 0 | Los Angeles | 90001 | 33.973616 | -118.249020 | 54492 | 29.85 | 3 | 0 | 27 | 0 | 0 | 0.25 | 0 | 1 | 0 | 1 | 0 | 0 | (33, 35] |
| 1 | 0 | 0 | 0 | 34 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | Mailed check | 56.95 | 1889.50 | Male | 46 | 0 | 0 | Los Angeles | 90002 | 33.949255 | -118.246978 | 44586 | 2470.56 | 3 | 0 | 61 | 0 | 0 | 0.00 | 0 | 1 | 0 | 0 | 1 | 0 | (33, 35] |
| 2 | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | Mailed check | 53.85 | 108.15 | Male | 37 | 0 | 0 | Los Angeles | 90003 | 33.964131 | -118.272783 | 58198 | 129.09 | 1 | 1 | 86 | Competitor | Competitor made better offer | 0.20 | 0 | 1 | 0 | 1 | 0 | 0 | (33, 35] |
| 3 | 0 | 0 | 0 | 45 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | Bank transfer (automatic) | 42.30 | 1840.75 | Male | 53 | 0 | 0 | Los Angeles | 90004 | 34.076259 | -118.310715 | 67852 | 1840.75 | 3 | 0 | 66 | 0 | 0 | 0.40 | 0 | 1 | 0 | 0 | 1 | 0 | (33, 35] |
| 4 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Electronic check | 70.70 | 151.65 | Female | 19 | 1 | 2 | Los Angeles | 90005 | 34.059281 | -118.307420 | 43019 | 169.89 | 2 | 1 | 67 | Other | Moved | 0.50 | 0 | 0 | 1 | 1 | 0 | 0 | (33, 35] |
df5 = df1.copy()
#Renaming columns
df5.rename(columns=\
{'Senior Citizen':'Senior_Citizen', 'Phone Service':'Phone_Service','Multiple Lines':'Multiple_Lines',\
'Online Security':'Online_Security','Online Backup':'Online_Backup','Device Protection':'Device_Protection',\
'Tech Support':'Tech_Support','Streaming TV':'Streaming_TV','Streaming Movies':'Streaming_Movies',\
'Paperless Billing':'Paperless_Billing','Payment Method':'Payment_Method','Monthly Charges':'Monthly_Charges',\
'Total Charges':'Total_Charges','Satisfaction Score':'Satisfaction_Score','Churn Value':'Churn_Value',\
'Churn Score':'Churn_Score','Churn Category':'Churn_Category','Churn Reason':'Churn_Reason'},inplace=True)
senior_citizen_churn = df5.groupby(['Senior_Citizen','Churn_Value']).agg({'Churn_Value':'count'})
Senior_Churn_Percents = senior_citizen_churn.groupby(level=0).apply(lambda x:
(x*100) / x.sum()).rename(columns={'Churn_Value':'Churn %'})
df5.groupby(['Senior_Citizen','Churn_Value']).agg({'Churn_Value':'count'})\
.rename(columns={'Churn_Value':'Churn Count'})
Senior_Churn_Percents
| Churn Count | ||
|---|---|---|
| Senior_Citizen | Churn_Value | |
| 0 | 0 | 4508 |
| 1 | 1393 | |
| 1 | 0 | 666 |
| 1 | 476 |
| Churn % | ||
|---|---|---|
| Senior_Citizen | Churn_Value | |
| 0 | 0 | 76.393832 |
| 1 | 23.606168 | |
| 1 | 0 | 58.318739 |
| 1 | 41.681261 |
sns.countplot(x='Churn_Value', hue='Senior_Citizen', data=df5)
<AxesSubplot:xlabel='Churn_Value', ylabel='count'>
tech_support_senior = df5[df5.Senior_Citizen == 1].groupby(['Senior_Citizen','Churn_Value','Tech_Support'])\
.agg({'Churn_Value':'count'})
tech_support_senior_percents = tech_support_senior.groupby(level=0).apply(lambda x:
(x*100) / x.sum()).rename(columns={'Churn_Value':'Churn %'})
df5[df5.Senior_Citizen == 1].groupby(['Senior_Citizen','Churn_Value','Tech_Support'])\
.agg({'Churn_Value':'count'})
tech_support_senior_percents
| Churn_Value | |||
|---|---|---|---|
| Senior_Citizen | Churn_Value | Tech_Support | |
| 1 | 0 | 0 | 457 |
| 1 | 209 | ||
| 1 | 0 | 425 | |
| 1 | 51 |
| Churn % | |||
|---|---|---|---|
| Senior_Citizen | Churn_Value | Tech_Support | |
| 1 | 0 | 0 | 40.017513 |
| 1 | 18.301226 | ||
| 1 | 0 | 37.215412 | |
| 1 | 4.465849 |
sns.countplot(x='Churn_Value', hue='Tech_Support', data=df5[df5.Senior_Citizen==1])
<AxesSubplot:xlabel='Churn_Value', ylabel='count'>
tech_support_nonsenior = df5[df5.Senior_Citizen == 0].groupby(['Senior_Citizen','Churn_Value','Tech_Support'])\
.agg({'Churn_Value':'count'})
tech_support_nonsenior_percents = tech_support_nonsenior.groupby(level=0).apply(lambda x:
(x*100) / x.sum()).rename(columns={'Churn_Value':'Churn %'})
df5[df5.Senior_Citizen == 0].groupby(['Senior_Citizen','Churn_Value','Tech_Support'])\
.agg({'Churn_Value':'count'})
tech_support_nonsenior_percents
| Churn_Value | |||
|---|---|---|---|
| Senior_Citizen | Churn_Value | Tech_Support | |
| 0 | 0 | 0 | 2983 |
| 1 | 1525 | ||
| 1 | 0 | 1134 | |
| 1 | 259 |
| Churn % | |||
|---|---|---|---|
| Senior_Citizen | Churn_Value | Tech_Support | |
| 0 | 0 | 0 | 50.550754 |
| 1 | 25.843077 | ||
| 1 | 0 | 19.217082 | |
| 1 | 4.389087 |
df5.Satisfaction_Score[(df5.Senior_Citizen == 1) & (df5.Tech_Support == 1)].mean()
df5.Satisfaction_Score[(df5.Senior_Citizen == 1) & (df5.Tech_Support == 0)].mean()
3.3038461538461537
2.815192743764172
df5.Satisfaction_Score[(df5.Senior_Citizen == 0) & (df5.Tech_Support == 1)].mean()
df5.Satisfaction_Score[(df5.Senior_Citizen == 0) & (df5.Tech_Support == 0)].mean()
3.344170403587444
3.290259897983969
Fiber_optic = df5[df5.Churn_Value==1].groupby(['Senior_Citizen','Internet Service_Fiber optic'])\
.agg({'Internet Service_Fiber optic':'count'})
Fiber_optic_percents = Fiber_optic.groupby(level=0).apply(lambda x:
(x*100) / x.sum()).rename(columns={'Internet Service_Fiber optic':'Internet Service %'})
Fiber_optic
Fiber_optic_percents.style.background_gradient(cmap = 'Blues')
| Internet Service_Fiber optic | ||
|---|---|---|
| Senior_Citizen | Internet Service_Fiber optic | |
| 0 | 0 | 489 |
| 1 | 904 | |
| 1 | 0 | 83 |
| 1 | 393 |
| Internet Service % | ||
|---|---|---|
| Senior_Citizen | Internet Service_Fiber optic | |
| 0 | 0 | 35.104092 |
| 1 | 64.895908 | |
| 1 | 0 | 17.436975 |
| 1 | 82.563025 |
</div>
!jupyter nbconvert --to html Final_Project_Group6_FinalVersion.ipynb
[NbConvertApp] Converting notebook Final_Project_Group6_FinalVersion.ipynb to html [NbConvertApp] Writing 4729414 bytes to Final_Project_Group6_FinalVersion.html